# Importing Dataset

# Read the listings and calendar CSV exports, treating the first row of each
# file as column names.
# NOTE: data_dir is machine-specific; point it at the folder holding the two
# CSV files before running this script on another machine.
data_dir <- "C:/Users/lawye/Downloads"
listing <- read.csv(file.path(data_dir, "listings.csv"), header = TRUE)
calendar <- read.csv(file.path(data_dir, "calendar.csv"), header = TRUE)
# Ordinary least-squares fit of price on room type, minimum nights, review
# counts/rates, the host's listing count and availability_365, followed by
# the coefficient table. The formula is kept inline so the printed "Call:"
# shows the full model specification.
model <- lm(
  formula = price ~ room_type +
    minimum_nights +
    number_of_reviews +
    reviews_per_month +
    calculated_host_listings_count +
    availability_365,
  data = listing
)
summary(model)
## 
## Call:
## lm(formula = price ~ room_type + minimum_nights + number_of_reviews + 
##     reviews_per_month + calculated_host_listings_count + availability_365, 
##     data = listing)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -261.4  -84.7  -34.3   15.3 9933.2 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     203.12477    2.95291  68.788  < 2e-16 ***
## room_typePrivate room          -134.64909    3.09622 -43.488  < 2e-16 ***
## room_typeShared room           -200.48755    7.61709 -26.321  < 2e-16 ***
## minimum_nights                   -0.15554    0.06529  -2.382   0.0172 *  
## number_of_reviews                -0.22882    0.03134  -7.301 2.92e-13 ***
## reviews_per_month                -9.17536    0.77029 -11.912  < 2e-16 ***
## calculated_host_listings_count    0.76611    0.14082   5.440 5.35e-08 ***
## availability_365                  0.18005    0.01068  16.861  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 268.8 on 35229 degrees of freedom
##   (9267 observations deleted due to missingness)
## Multiple R-squared:  0.0763, Adjusted R-squared:  0.07611 
## F-statistic: 415.7 on 7 and 35229 DF,  p-value: < 2.2e-16
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.3
# Put a copy of listing's columns on the search path. Any statement below that
# names a column bare, outside a data-aware call such as ggplot() or a dplyr
# verb, resolves through this copy. It is a snapshot taken here: columns added
# to listing afterwards are not visible through it.
attach(listing)

# Price distribution by room type
head(listing)
ggplot(data = listing, mapping = aes(x = price, fill = room_type)) +
  geom_density(alpha = 0.5) +
  labs(x = "Room Price", y = "Density") +
  scale_fill_discrete(name = "Room Type") +
  ggtitle("Airbnb Room Price Distribution") +
  # Centre the title over the panel.
  theme(plot.title = element_text(hjust = 0.5)) +
  # Scale limit: prices outside 0-500 are removed before the density is
  # estimated, which is what the "Removed ... rows" warning reports.
  xlim(c(0, 500))
## Warning: Removed 2280 rows containing non-finite values (stat_density).

library(leaflet)
## Warning: package 'leaflet' was built under R version 3.4.4
# Geographical mapping of Airbnb dataset

# One clustered marker per listing: the popup shows the price and the hover
# label shows the listing name.
airbnb_map <- leaflet(listing) %>%
  addTiles(attribution = "Airbnb Dataset") %>%
  addMarkers(
    lng = ~longitude,
    lat = ~latitude,
    popup = ~as.character(price),
    label = ~as.character(name),
    clusterOptions = markerClusterOptions()
  )
airbnb_map
# Determine whether there is a significant difference in room price between listings on Santa Catalina Island and listings off the island

# Flag listings whose neighbourhood mentions Two Harbors, Avalon or Catalina
# (case-insensitive). The column is read from listing explicitly instead of
# through the attach()ed copy, so the lookup cannot be hijacked by an unrelated
# global named `neighbourhood`; TRUE replaces the reassignable alias T.
indices <- grepl(
  "two harbors|avalon|catalina",
  listing$neighbourhood,
  ignore.case = TRUE
)
listing$Catalina <- ifelse(indices, "Yes", "No")

# Side-by-side price boxplots for listings flagged Catalina = "Yes" / "No".
ggplot(data = listing, mapping = aes(x = Catalina, y = price, fill = Catalina)) +
  geom_boxplot() +
  # Scale limit: prices outside 0-500 are removed before the box statistics
  # are computed, which is what the "Removed ... rows" warning reports.
  ylim(c(0, 500)) +
  xlab("Santa Catalina Island") +
  ggtitle("Room Price in Catalina Island") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "IslandOrNot")
## Warning: Removed 2280 rows containing non-finite values (stat_boxplot).

# Mean price for each value of the Catalina flag.
listing %>%
  group_by(Catalina) %>%
  summarise(AvgPrice = mean(price))

# Overlaid price densities for the two Catalina groups, limited to 0-1000.
ggplot(data = listing, mapping = aes(x = price, fill = Catalina)) +
  geom_density(alpha = 0.5) +
  labs(x = "Room Price", y = "Density") +
  scale_fill_discrete(name = "Catalina Island") +
  ggtitle("Airbnb Room Price Distribution") +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlim(c(0, 1000))
## Warning: Removed 890 rows containing non-finite values (stat_density).

# Determine whether Airbnb rooms near a beach (proxied by "beach" appearing in the listing name) are generally more expensive than those that are not

# Flag listings whose name contains "beach" (case-insensitive). The column is
# read from listing explicitly instead of through the attach()ed copy, so the
# lookup cannot be hijacked by an unrelated global named `name`; TRUE replaces
# the reassignable alias T.
indices2 <- grepl("beach", listing$name, ignore.case = TRUE)
listing$Beach <- ifelse(indices2, "Yes", "No")

# Side-by-side price boxplots for listings flagged Beach = "Yes" / "No".
ggplot(data = listing, mapping = aes(x = Beach, y = price, fill = Beach)) +
  geom_boxplot() +
  # Scale limit: prices outside 0-500 are removed before the box statistics
  # are computed, which is what the "Removed ... rows" warning reports.
  ylim(c(0, 500)) +
  xlab("Beach") +
  ggtitle("Room Price By Beach") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_discrete(name = "BeachOrNot")
## Warning: Removed 2280 rows containing non-finite values (stat_boxplot).

# Mean price for each value of the Beach flag.
listing %>%
  group_by(Beach) %>%
  summarise(AvgPrice = mean(price))
# Divide Airbnb room availability into six ordered ranges (a single categorical variable) and determine how much
# they affect the room price

# Bin availability_365 into six labelled ranges.
# cut() builds right-closed intervals by default, so the first bin would be
# (0, 20] and a value of exactly 0 would be coded NA even though the label
# reads "0-20". include.lowest = TRUE closes that bin on the left. The run
# captured before this change reported 7495 NA rows -- presumably the
# zero-availability listings; verify against the data.
listing$Availability <- cut(
  listing$availability_365,
  breaks = c(0, 20, 72, 146, 220, 294, 365),
  labels = c("0-20", "21-72", "73-146", "147-220", "221-294", "295-365"),
  include.lowest = TRUE
)
# exclude = NULL keeps an <NA> column in the tally so any unbinned rows show.
table(listing$Availability, exclude = NULL)

# Floored mean price per availability bin and Beach flag. Rows that still have
# no bin are removed by an explicit NA test rather than by slicing off the
# last two summary rows by position, which silently drops real bins whenever
# the NA group is absent or incomplete.
new_list <- listing %>%
  filter(!is.na(Availability)) %>%
  group_by(Availability, Beach) %>%
  summarise(AvgPrice = floor(mean(price))) %>%
  ungroup()

# Dodged bar chart of the floored average price per availability bin, one bar
# per Beach value, with the value printed above each bar.
beach_palette <- c("#0073C2FF", "#EFC000FF")
bar_dodge_width <- 0.8

p <- ggplot(data = new_list, mapping = aes(x = Availability, y = AvgPrice)) +
  geom_bar(
    mapping = aes(color = Beach, fill = Beach),
    stat = "identity",
    position = position_dodge(bar_dodge_width),
    width = 0.7
  ) +
  scale_color_manual(values = beach_palette) +
  scale_fill_manual(values = beach_palette) +
  ylim(c(0, 400)) +
  ggtitle("Airbnb Room Price By Availability and Beach") +
  theme(plot.title = element_text(hjust = 0.5))

# The labels use the same dodge width as the bars so each number sits over
# its own bar; the negative vjust lifts it just above the bar top.
p +
  geom_text(
    mapping = aes(label = AvgPrice, group = Beach),
    position = position_dodge(bar_dodge_width),
    vjust = -0.3,
    size = 3.5
  )

# Measure how the price of Airbnb rooms on Santa Catalina Island is affected by availability

# Re-bin availability_365 with a second set of six ranges; this overwrites
# any existing listing$Availability column.
# cut() builds right-closed intervals by default, so the first bin would be
# (0, 72] and a value of exactly 0 would be coded NA even though the label
# reads "0-72". include.lowest = TRUE closes that bin on the left. The run
# captured before this change reported 7495 NA rows -- presumably the
# zero-availability listings; verify against the data.
listing$Availability <- cut(
  listing$availability_365,
  breaks = c(0, 72, 146, 220, 300, 330, 365),
  labels = c("0-72", "73-146", "147-220", "221-300", "301-330", "331-365"),
  include.lowest = TRUE
)
# exclude = NULL keeps an <NA> column in the tally so any unbinned rows show.
table(listing$Availability, exclude = NULL)

# Floored mean price per availability bin and Catalina flag. Rows that still
# have no bin are removed by an explicit NA test rather than by slicing off
# the last two summary rows by position, which silently drops real bins
# whenever the NA group is absent or incomplete.
new_list2 <- listing %>%
  filter(!is.na(Availability)) %>%
  group_by(Availability, Catalina) %>%
  summarise(AvgPrice = floor(mean(price))) %>%
  ungroup()

# Dodged bar chart of the floored average price per availability bin, one bar
# per Catalina value, with the value printed above each bar.
catalina_palette <- c("#0073C2FF", "#EFC000FF")
bar_dodge_width <- 0.8

p2 <- ggplot(data = new_list2, mapping = aes(x = Availability, y = AvgPrice)) +
  geom_bar(
    mapping = aes(color = Catalina, fill = Catalina),
    stat = "identity",
    position = position_dodge(bar_dodge_width),
    width = 0.7
  ) +
  scale_color_manual(values = catalina_palette) +
  scale_fill_manual(values = catalina_palette) +
  ylim(c(0, 700)) +
  ggtitle("Airbnb Room Price By Availability and Catalina") +
  theme(plot.title = element_text(hjust = 0.5))

# The labels use the same dodge width as the bars so each number sits over
# its own bar; the negative vjust lifts it just above the bar top.
p2 +
  geom_text(
    mapping = aes(label = AvgPrice, group = Catalina),
    position = position_dodge(bar_dodge_width),
    vjust = -0.8,
    size = 3.5
  )